{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 01 Identifying individuals, variables and categorical variables in a data set"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%%html\n",
""
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"import plotly.graph_objects as go\n",
"import seaborn as sns\n",
"from sklearn import preprocessing"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-10_2.12:3.1.3,org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.3 pyspark-shell'\n",
"\n",
"import findspark\n",
"\n",
"findspark.init()\n",
"from pyspark.context import SparkContext\n",
"from pyspark.ml.feature import OneHotEncoder, StringIndexer\n",
"from pyspark.sql import functions as F\n",
"from pyspark.sql.types import *\n",
"from pyspark.sql.session import SparkSession\n",
"\n",
"spark = SparkSession.builder.appName(\"statistics\").master(\"local\").getOrCreate()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import uuid\n",
"from confluent_kafka import Producer\n",
"from ksql import KSQLAPI\n",
"bootstrap_servers='[::1]:9092'\n",
"topic=f'stats0101{str(uuid.uuid4())[:7]}'\n",
"msg_count=5\n",
"client = KSQLAPI(url='http://localhost:8088')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[khanacademy](https://www.khanacademy.org/math/ap-statistics/analyzing-categorical-ap/analyzing-one-categorical-variable/v/identifying-individuals-variables-and-categorical-variables-in-a-data-set?modal=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### dataset"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"dataset = {\n",
" \"Drink\": [\n",
" \"Brewed coffee\",\n",
" \"Caffe latte\",\n",
" \"Caffe mocha\",\n",
" \"Cappuccino\",\n",
" \"Iced brewed coffee\",\n",
" \"Chai latte\",\n",
" ],\n",
" \"Type\": [\"Hot\", \"Hot\", \"Hot\", \"Hot\", \"Cold\", \"Hot\"],\n",
" \"Calories\": [4, 100, 170, 60, 60, 120],\n",
" \"Sugars (g)\": [0, 14, 27, 8, 15, 25],\n",
" \"Caffein (mg)\": [260, 75, 95, 75, 120, 60],\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Pandas Dataframe"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Type | \n",
" Calories | \n",
" Sugars (g) | \n",
" Caffein (mg) | \n",
"
\n",
" \n",
" Drink | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" Brewed coffee | \n",
" Hot | \n",
" 4 | \n",
" 0 | \n",
" 260 | \n",
"
\n",
" \n",
" Caffe latte | \n",
" Hot | \n",
" 100 | \n",
" 14 | \n",
" 75 | \n",
"
\n",
" \n",
" Caffe mocha | \n",
" Hot | \n",
" 170 | \n",
" 27 | \n",
" 95 | \n",
"
\n",
" \n",
" Cappuccino | \n",
" Hot | \n",
" 60 | \n",
" 8 | \n",
" 75 | \n",
"
\n",
" \n",
" Iced brewed coffee | \n",
" Cold | \n",
" 60 | \n",
" 15 | \n",
" 120 | \n",
"
\n",
" \n",
" Chai latte | \n",
" Hot | \n",
" 120 | \n",
" 25 | \n",
" 60 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Type Calories Sugars (g) Caffein (mg)\n",
"Drink \n",
"Brewed coffee Hot 4 0 260\n",
"Caffe latte Hot 100 14 75\n",
"Caffe mocha Hot 170 27 95\n",
"Cappuccino Hot 60 8 75\n",
"Iced brewed coffee Cold 60 15 120\n",
"Chai latte Hot 120 25 60"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(dataset).set_index(\"Drink\")\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Spark Dataframe"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------------+----+--------+----------+------------+\n",
"| Drink|Type|Calories|Sugars (g)|Caffein (mg)|\n",
"+------------------+----+--------+----------+------------+\n",
"| Brewed coffee| Hot| 4| 0| 260|\n",
"| Caffe latte| Hot| 100| 14| 75|\n",
"| Caffe mocha| Hot| 170| 27| 95|\n",
"| Cappuccino| Hot| 60| 8| 75|\n",
"|Iced brewed coffee|Cold| 60| 15| 120|\n",
"| Chai latte| Hot| 120| 25| 60|\n",
"+------------------+----+--------+----------+------------+\n",
"\n"
]
}
],
"source": [
"sdf = spark.createDataFrame(zip(*dataset.values()), schema=list(dataset.keys()))\n",
"sdf.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Kafka producer"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"p = Producer({'bootstrap.servers': bootstrap_servers})\n",
"for i in zip(*dataset.values()):\n",
" x = dict(zip(dataset.keys(), i))\n",
" record_key = str(uuid.uuid4())\n",
" record_value = json.dumps(x)\n",
"\n",
" p.produce(topic, value=record_value)\n",
" p.poll(0)\n",
" \n",
"p.flush() "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### KSQL stream"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[{\"header\":{\"queryId\":\"transient_STATS0101E908401_8697743092071727740\",\"schema\":\"`Drink` STRING, `Type` STRING, `Calories` BIGINT, `Sugars (g)` BIGINT, `Caffein (mg)` BIGINT\"}},\n",
"\n",
"{\"row\":{\"columns\":[\"Brewed coffee\",\"Hot\",4,0,260]}},\n",
"\n",
"{\"row\":{\"columns\":[\"Caffe latte\",\"Hot\",100,14,75]}},\n",
"\n",
"{\"row\":{\"columns\":[\"Caffe mocha\",\"Hot\",170,27,95]}},\n",
"\n",
"{\"row\":{\"columns\":[\"Cappuccino\",\"Hot\",60,8,75]}},\n",
"\n",
"{\"row\":{\"columns\":[\"Iced brewed coffee\",\"Cold\",60,15,120]}},\n",
"\n",
"{\"row\":{\"columns\":[\"Chai latte\",\"Hot\",120,25,60]}},\n",
"\n",
"]\n",
"\n",
"\n"
]
}
],
"source": [
"client.create_stream(table_name=topic,\n",
" columns_type=['`Drink` string','`Type` string', '`Calories` bigint','`Sugars (g)` bigint', '`Caffein (mg)` bigint'],\n",
" topic=topic,\n",
" value_format='JSON'\n",
" )\n",
"res = client.query(f\"select * from {topic}\")\n",
"while True:\n",
" try:\n",
" print(next(res))\n",
" except RuntimeError:\n",
" print('')\n",
" break"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Spark stream"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### read stream"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- key: binary (nullable = true)\n",
" |-- value: binary (nullable = true)\n",
" |-- topic: string (nullable = true)\n",
" |-- partition: integer (nullable = true)\n",
" |-- offset: long (nullable = true)\n",
" |-- timestamp: timestamp (nullable = true)\n",
" |-- timestampType: integer (nullable = true)\n",
"\n"
]
}
],
"source": [
"df_raw = spark \\\n",
" .readStream \\\n",
" .format('kafka') \\\n",
" .option('kafka.bootstrap.servers', bootstrap_servers) \\\n",
" .option(\"startingOffsets\", \"earliest\") \\\n",
" .option('subscribe', topic) \\\n",
" .load()\n",
"df_raw.printSchema()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### write stream"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"df_json = df_raw.selectExpr('CAST(value AS STRING) as json')\n",
"df_json.select(F.from_json(df_json.json, sdf.schema).alias('stream_data')) \\\n",
" .writeStream \\\n",
" .queryName(topic) \\\n",
" .trigger(once=True) \\\n",
" .format(\"memory\") \\\n",
" .start() \\\n",
" .awaitTermination()"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---------------------------------------+\n",
"|stream_data |\n",
"+---------------------------------------+\n",
"|{Brewed coffee, Hot, 4, 0, 260} |\n",
"|{Caffe latte, Hot, 100, 14, 75} |\n",
"|{Caffe mocha, Hot, 170, 27, 95} |\n",
"|{Cappuccino, Hot, 60, 8, 75} |\n",
"|{Iced brewed coffee, Cold, 60, 15, 120}|\n",
"|{Chai latte, Hot, 120, 25, 60} |\n",
"+---------------------------------------+\n",
"\n"
]
}
],
"source": [
"spark.sql(f\"select * from {topic}\").show(truncate=False)"
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"## Find and Replace"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Panads"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Type | \n",
" Calories | \n",
" Sugars (g) | \n",
" Caffein (mg) | \n",
"
\n",
" \n",
" Drink | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" Brewed coffee | \n",
" 1 | \n",
" 4 | \n",
" 0 | \n",
" 260 | \n",
"
\n",
" \n",
" Caffe latte | \n",
" 1 | \n",
" 100 | \n",
" 14 | \n",
" 75 | \n",
"
\n",
" \n",
" Caffe mocha | \n",
" 1 | \n",
" 170 | \n",
" 27 | \n",
" 95 | \n",
"
\n",
" \n",
" Cappuccino | \n",
" 1 | \n",
" 60 | \n",
" 8 | \n",
" 75 | \n",
"
\n",
" \n",
" Iced brewed coffee | \n",
" 0 | \n",
" 60 | \n",
" 15 | \n",
" 120 | \n",
"
\n",
" \n",
" Chai latte | \n",
" 1 | \n",
" 120 | \n",
" 25 | \n",
" 60 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Type Calories Sugars (g) Caffein (mg)\n",
"Drink \n",
"Brewed coffee 1 4 0 260\n",
"Caffe latte 1 100 14 75\n",
"Caffe mocha 1 170 27 95\n",
"Cappuccino 1 60 8 75\n",
"Iced brewed coffee 0 60 15 120\n",
"Chai latte 1 120 25 60"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_fr = df.copy()\n",
"df_fr.replace({\"Type\": {\"Hot\": 1, \"Cold\": 0}}, inplace=True)\n",
"df_fr"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Spark"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------------+----+--------+----------+------------+\n",
"| Drink|Type|Calories|Sugars (g)|Caffein (mg)|\n",
"+------------------+----+--------+----------+------------+\n",
"| Brewed coffee| 1| 4| 0| 260|\n",
"| Caffe latte| 1| 100| 14| 75|\n",
"| Caffe mocha| 1| 170| 27| 95|\n",
"| Cappuccino| 1| 60| 8| 75|\n",
"|Iced brewed coffee| 0| 60| 15| 120|\n",
"| Chai latte| 1| 120| 25| 60|\n",
"+------------------+----+--------+----------+------------+\n",
"\n"
]
}
],
"source": [
"sdf_fr = sdf.withColumn(\n",
" \"Type\", F.when(F.col(\"Type\") == \"Hot\", 1).when(F.col(\"Type\") == \"Cold\", 0)\n",
")\n",
"sdf_fr.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Label Encoding"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Pandas"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Type | \n",
" Calories | \n",
" Sugars (g) | \n",
" Caffein (mg) | \n",
" Type_Hot | \n",
"
\n",
" \n",
" Drink | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" Brewed coffee | \n",
" Hot | \n",
" 4 | \n",
" 0 | \n",
" 260 | \n",
" 1 | \n",
"
\n",
" \n",
" Caffe latte | \n",
" Hot | \n",
" 100 | \n",
" 14 | \n",
" 75 | \n",
" 1 | \n",
"
\n",
" \n",
" Caffe mocha | \n",
" Hot | \n",
" 170 | \n",
" 27 | \n",
" 95 | \n",
" 1 | \n",
"
\n",
" \n",
" Cappuccino | \n",
" Hot | \n",
" 60 | \n",
" 8 | \n",
" 75 | \n",
" 1 | \n",
"
\n",
" \n",
" Iced brewed coffee | \n",
" Cold | \n",
" 60 | \n",
" 15 | \n",
" 120 | \n",
" 0 | \n",
"
\n",
" \n",
" Chai latte | \n",
" Hot | \n",
" 120 | \n",
" 25 | \n",
" 60 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Type Calories Sugars (g) Caffein (mg) Type_Hot\n",
"Drink \n",
"Brewed coffee Hot 4 0 260 1\n",
"Caffe latte Hot 100 14 75 1\n",
"Caffe mocha Hot 170 27 95 1\n",
"Cappuccino Hot 60 8 75 1\n",
"Iced brewed coffee Cold 60 15 120 0\n",
"Chai latte Hot 120 25 60 1"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_lc = df.copy()\n",
"df_lc[\"Type_Hot\"] = df_lc[\"Type\"].astype(\"category\").cat.codes\n",
"df_lc"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Sklearn"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Type | \n",
" Calories | \n",
" Sugars (g) | \n",
" Caffein (mg) | \n",
" Type_Hot | \n",
"
\n",
" \n",
" Drink | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" Brewed coffee | \n",
" Hot | \n",
" 4 | \n",
" 0 | \n",
" 260 | \n",
" 1 | \n",
"
\n",
" \n",
" Caffe latte | \n",
" Hot | \n",
" 100 | \n",
" 14 | \n",
" 75 | \n",
" 1 | \n",
"
\n",
" \n",
" Caffe mocha | \n",
" Hot | \n",
" 170 | \n",
" 27 | \n",
" 95 | \n",
" 1 | \n",
"
\n",
" \n",
" Cappuccino | \n",
" Hot | \n",
" 60 | \n",
" 8 | \n",
" 75 | \n",
" 1 | \n",
"
\n",
" \n",
" Iced brewed coffee | \n",
" Cold | \n",
" 60 | \n",
" 15 | \n",
" 120 | \n",
" 0 | \n",
"
\n",
" \n",
" Chai latte | \n",
" Hot | \n",
" 120 | \n",
" 25 | \n",
" 60 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Type Calories Sugars (g) Caffein (mg) Type_Hot\n",
"Drink \n",
"Brewed coffee Hot 4 0 260 1\n",
"Caffe latte Hot 100 14 75 1\n",
"Caffe mocha Hot 170 27 95 1\n",
"Cappuccino Hot 60 8 75 1\n",
"Iced brewed coffee Cold 60 15 120 0\n",
"Chai latte Hot 120 25 60 1"
]
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_lc = df.copy()\n",
"df_lc[\"Type_Hot\"] = preprocessing.LabelEncoder().fit_transform(df[\"Type\"])\n",
"df_lc"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Spark"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------------+----+--------+----------+------------+---------+\n",
"| Drink|Type|Calories|Sugars (g)|Caffein (mg)|Type_Cold|\n",
"+------------------+----+--------+----------+------------+---------+\n",
"| Brewed coffee| Hot| 4| 0| 260| 0.0|\n",
"| Caffe latte| Hot| 100| 14| 75| 0.0|\n",
"| Caffe mocha| Hot| 170| 27| 95| 0.0|\n",
"| Cappuccino| Hot| 60| 8| 75| 0.0|\n",
"|Iced brewed coffee|Cold| 60| 15| 120| 1.0|\n",
"| Chai latte| Hot| 120| 25| 60| 0.0|\n",
"+------------------+----+--------+----------+------------+---------+\n",
"\n"
]
}
],
"source": [
"sdf_lc = StringIndexer(inputCol=\"Type\", outputCol=\"Type_Cold\").fit(sdf).transform(sdf)\n",
"sdf_lc.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## One-Hot Encoding"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Pandas"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Calories | \n",
" Sugars (g) | \n",
" Caffein (mg) | \n",
" Type_Cold | \n",
" Type_Hot | \n",
"
\n",
" \n",
" Drink | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" Brewed coffee | \n",
" 4 | \n",
" 0 | \n",
" 260 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" Caffe latte | \n",
" 100 | \n",
" 14 | \n",
" 75 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" Caffe mocha | \n",
" 170 | \n",
" 27 | \n",
" 95 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" Cappuccino | \n",
" 60 | \n",
" 8 | \n",
" 75 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" Iced brewed coffee | \n",
" 60 | \n",
" 15 | \n",
" 120 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" Chai latte | \n",
" 120 | \n",
" 25 | \n",
" 60 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Calories Sugars (g) Caffein (mg) Type_Cold Type_Hot\n",
"Drink \n",
"Brewed coffee 4 0 260 0 1\n",
"Caffe latte 100 14 75 0 1\n",
"Caffe mocha 170 27 95 0 1\n",
"Cappuccino 60 8 75 0 1\n",
"Iced brewed coffee 60 15 120 1 0\n",
"Chai latte 120 25 60 0 1"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_ohc = pd.get_dummies(df, columns=[\"Type\"], prefix=\"Type\")\n",
"df_ohc"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Sklearn"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Calories | \n",
" Sugars (g) | \n",
" Caffein (mg) | \n",
" Type_Cold | \n",
" Type_Hot | \n",
"
\n",
" \n",
" Drink | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" Brewed coffee | \n",
" 4 | \n",
" 0 | \n",
" 260 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" Caffe latte | \n",
" 100 | \n",
" 14 | \n",
" 75 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" Caffe mocha | \n",
" 170 | \n",
" 27 | \n",
" 95 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" Cappuccino | \n",
" 60 | \n",
" 8 | \n",
" 75 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" Iced brewed coffee | \n",
" 60 | \n",
" 15 | \n",
" 120 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" Chai latte | \n",
" 120 | \n",
" 25 | \n",
" 60 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Calories Sugars (g) Caffein (mg) Type_Cold Type_Hot\n",
"Drink \n",
"Brewed coffee 4 0 260 0.0 1.0\n",
"Caffe latte 100 14 75 0.0 1.0\n",
"Caffe mocha 170 27 95 0.0 1.0\n",
"Cappuccino 60 8 75 0.0 1.0\n",
"Iced brewed coffee 60 15 120 1.0 0.0\n",
"Chai latte 120 25 60 0.0 1.0"
]
},
"execution_count": 82,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x = (\n",
" preprocessing.OneHotEncoder()\n",
" .fit_transform(df[\"Type\"].values.reshape(-1, 1))\n",
" .toarray()\n",
")\n",
"df_ohc = pd.concat(\n",
" [\n",
" df.drop(columns=\"Type\").reset_index(),\n",
" pd.DataFrame(x, columns=[\"Type_Cold\", \"Type_Hot\"]),\n",
" ],\n",
" axis=1,\n",
")\n",
"df_ohc.set_index(\"Drink\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Spark"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------------+----+--------+----------+------------+---------+-------------+\n",
"| Drink|Type|Calories|Sugars (g)|Caffein (mg)|Type_Cold| Type_Vec|\n",
"+------------------+----+--------+----------+------------+---------+-------------+\n",
"| Brewed coffee| Hot| 4| 0| 260| 0.0|(1,[0],[1.0])|\n",
"| Caffe latte| Hot| 100| 14| 75| 0.0|(1,[0],[1.0])|\n",
"| Caffe mocha| Hot| 170| 27| 95| 0.0|(1,[0],[1.0])|\n",
"| Cappuccino| Hot| 60| 8| 75| 0.0|(1,[0],[1.0])|\n",
"|Iced brewed coffee|Cold| 60| 15| 120| 1.0| (1,[],[])|\n",
"| Chai latte| Hot| 120| 25| 60| 0.0|(1,[0],[1.0])|\n",
"+------------------+----+--------+----------+------------+---------+-------------+\n",
"\n"
]
}
],
"source": [
"sdf_ohc = (\n",
" OneHotEncoder(inputCol=\"Type_Cold\", outputCol=\"Type_Vec\")\n",
" .fit(sdf_lc)\n",
" .transform(sdf_lc)\n",
")\n",
"sdf_ohc.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
},
"toc-autonumbering": true,
"toc-showcode": false,
"toc-showmarkdowntxt": true
},
"nbformat": 4,
"nbformat_minor": 4
}